# ==============================================================================
# file name: 01-identify-survey-hosts-YG.R
# date:	Nov 24, 2022
# author: Bernhard Clemm / Tiago Ventura 
# purpose: Identify hosts of survey sites (YouGov) with three approaches:
#          (1) questionnaire tools from Bevec & Vehovar (2021)
#          (2) all URL hosts containing "survey"
#          (3) manually coding most-visited 500 URL hosts not already in (1)/(2)
# NOTE: THIS SCRIPT REQUIRES ACCESS TO THE RAW DATA AND SERVES FOR REFERENCE ONLY
# ==============================================================================

# SETUP ========================================================================

# Load the YouGov browsing data; the rest of the script reads its
# `page_domain` column (the URL host of each record).
# The file path is blank in this reference copy -- see the file header: the raw
# data are required to actually run the script.
# NOTE(review): no library() calls are visible in this file; read_rds(), the
# pipe and the dplyr verbs presumably come from the tidyverse being attached
# elsewhere -- confirm.
browsing_YG <- read_rds("")

# (1) Bevec & Vehovar (2021) ===================================================

# Read the regular expression describing the questionnaire-tool hosts.
# read_file() (presumably readr's) returns the whole file as a single string,
# line breaks included, so a line break at the end of the file would become
# part of the pattern -- a character a host name is not expected to contain.
# Strip trailing line breaks before using the pattern.
# NOTE(review): this input path is relative, whereas the outputs below are
# prefixed with `path` (not defined in this file) -- confirm this is intended.
match_patterns <- read_file("data/raw_to_processed/url_hosts/bevec_url_matches_patterns.txt")
match_patterns <- sub("[\r\n]+$", "", match_patterns)

# An empty pattern would match every host, so fail loudly instead.
stopifnot(nzchar(match_patterns))

# match URL hosts in data to patterns and create list of unique hosts
# (deduplicate first so the regular expression runs once per host)
hosts_bevec_YG <- browsing_YG %>%
  distinct(page_domain) %>%
  filter(grepl(match_patterns, page_domain))

# Save the one-column table of matched hosts.
write.csv(hosts_bevec_YG, paste0(path, "data/url_hosts/hosts_bevec_YG.csv"),
          row.names = FALSE)

# (2) URL hosts containing "survey" ============================================

# Unique hosts whose name contains the literal string "survey".
# Deduplicate before matching so the search runs once per host; the resulting
# set and its order are the same as when matching first.
hosts_survey_YG <- browsing_YG %>%
  distinct(page_domain) %>%
  filter(grepl("survey", page_domain, fixed = TRUE))

# only keep those not already in Bevec host list
# (both tables hold the single column `page_domain`, so this is a row-wise
# set difference)
hosts_survey_YG <- setdiff(hosts_survey_YG, hosts_bevec_YG)

# Save the remaining "survey" hosts.
# NOTE(review): `path` is not defined in this file -- presumably set by the
# calling environment; confirm.
write.csv(hosts_survey_YG, paste0(path, "data/url_hosts/hosts_survey_YG.csv"),
          row.names = FALSE)

# (3) most-visited 500 =========================================================

# visits per host: one row per non-empty host, `count` = number of records,
# restricted to hosts not in the other two lists and sorted most-visited first
browsing_hosts_YG <- browsing_YG %>%
  filter(page_domain != "") %>%
  count(page_domain, name = "count") %>%
  filter(!(page_domain %in% c(hosts_survey_YG$page_domain, hosts_bevec_YG$page_domain))) %>%
  arrange(desc(count))

# Take top 500 - these will be coded manually
# (head() keeps exactly the first 500 rows, or fewer if fewer hosts remain, so
# hosts tied with the 500th are not all retained)
hosts_500_YG <- browsing_hosts_YG %>% head(500)

# Save the hosts to be coded manually together with their visit counts.
# NOTE(review): `path` is not defined in this file -- presumably set by the
# calling environment; confirm.
write.csv(hosts_500_YG, paste0(path, "data/url_hosts/hosts_500_YG.csv"),
          row.names = FALSE)





